/* 

Paper: Gentrification and pioneer businesses 
Authors: Behrens, Boulam, Martin, Mayneris 

Name dofile: create_db_table5.do  
Version: 16 nov. 2021 

Output: predict_gentri_database_philly.dta

Inputs:

- philly_1990_2000_2010_FULL.dta  (authors' creation from NETS and GIS)
- naics5.dta (authors computation, publicly available)
- naics4.dta (authors computation, publicly available)
- pioneers_final_negbin.dta (from identify_pioneers_negbin.do)
- growth_est_9000.dta (authors computation, publicly available data)
- id_conc_database_philly.dta (from create_db_concordedblocks_philly.do)
- gentri_dum_id_conc_philly.dta (from identify_gentri_blocks_philly.do)
- id_conc_disk_500m_philly.dta.dta (GIS, publicly available)
- id_conc_distance_nearest_subway_philly.dta (GIS, publicly available)
- distance_idconc_water_philly.dta (GIS, publicly available)
- distance_idconc_parks_philly.dta (GIS, publicly available)
- landmarks_id_conc_count_philly.dta (GIS, publicly available)
- id_conc_pop_income_philly.dta (from prepare_variables4reg_philly.do)
- blocks_couldgentrify_philly.dta (from identify_gentri_blocks_philly.do)
*/ 

********************************************
*Creation of plants counts at the ring-level
********************************************
use philly_1990_2000_2010_FULL, clear

drop if id_conc==.|year==.

tostring naics, replace

merge m:1 naics using naics5
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
tab toto
keep if toto==6
drop toto

*Merge with information on pioneer sectors
merge m:1 naics using pioneers_final_negbin.dta
rename pioneer_high_negbin pioneer
replace pioneer=0 if _merge==1
drop _merge 


*We merge with the sectoral growth rates for the Bartik instruments 
*(note that some sectors, mainly in the construction sector, are in 
*the NY Nets but not in the sect growth rates datasets)

destring naics, replace
merge m:1 naics using growth_est_9000.dta
drop if _merge==2
drop _merge

gen plants=1 

gen plants_instr=(1+g_total_9000) if year==1990 

bys id_conc year: egen plants_pion=sum(plants) if pioneer==1
bys id_conc year: egen var=max(plants_pion) 
replace plants_pion=var if pioneer==0
replace plants_pion=0 if pioneer==0&var==.
drop var

bys id_conc year: egen plants_npion=sum(plants) if pioneer==0
bys id_conc year: egen var=max(plants_npion) 
replace plants_npion=var if pioneer==1
replace plants_npion=0 if pioneer==1&var==.
drop var

bys id_conc year: egen plants_pion_instr=sum(plants_instr) if pioneer==1
bys id_conc year: egen var=max(plants_pion_instr) 
replace plants_pion_instr=var if pioneer==0
replace plants_pion_instr=0 if pioneer==0&var==.
drop var

keep id_conc year plants_pion plants_npion plants_pion_instr 

duplicates drop id_conc year, force

egen t=group(year)

fillin id_conc t

bys t: egen var=max(year)
replace year=var if year==.
drop var 

foreach v of var plants_* {
replace `v'=0 if `v'==.
}

*For the instrument in 2000, it is the value in 1990 that we must take in reality (stock 1990*growth rate)
tsset id_conc t

replace plants_pion_instr=l.plants_pion_instr if t==2
replace plants_pion_instr=. if t==1|t==3


drop _fillin

saveold pioneer_estab_philly.dta, replace v(13)

use philly_1990_2000_2010_FULL, clear

drop if id_conc==.|year==.

tostring naics, replace

merge m:1 naics using naics5
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
tab toto
keep if toto==6
drop toto

gen plants=1

collapse (sum) plants, by(id_conc year)

gen t=1 if year==1990
replace t=2 if year==2000
replace t=3 if year==2010

merge 1:1 id_conc t using id_conc_database_philly
replace plants=0 if _m==2

keep id_conc t year plants

merge 1:1 id_conc t using pioneer_estab_philly.dta
drop if _m==2 

foreach v of var plants_pion plants_npion {
replace `v'=0 if _m==1
}

replace plants_pion_instr=0 if _m==1&t==2

drop _m 

gen plants_tot_pion=plants_npion+plants_pion

pwcorr plants plants_tot
*Correlation equal to 1

drop plants_tot_pion

bys t: egen var=max(year)
replace year=var if year==.
drop var

saveold plants_id_conc_philly.dta, v(13) replace

erase pioneer_estab_philly.dta
 

**************************************************
*Minimum distance to a 1990-2000 gentrifying block
**************************************************

use id_conc_database_philly, clear
keep if dis_ws<=30
keep id_conc intplat intplon
duplicates drop id_conc, force
saveold temp, replace v(13)

use gentri_dum_id_conc_philly.dta, clear 
keep if  gentri1990==1
duplicates drop id_conc, force 

keep id_conc 
merge 1:1 id_conc using temp

keep if _m==3

drop _m

rename intplat intplat_gentri
rename intplon intplon_gentri
rename id_conc id_conc_gentri
gen var=1
 
saveold coord_gentri_9000, replace v(13)

use temp, clear
gen var=1
joinby var using coord_gentri_9000

gen var2 = acos(sin((intplat10*_pi)/180) * sin((intplat_gentri*_pi)/180) + cos(abs((intplon10*_pi)/180- (intplon_gentri*_pi)/180)) * cos(intplat10*_pi/180) * cos(intplat_gentri*_pi/180)) * 6378.137

collapse (min) dis_gentri9000=var2 (mean) intplat10 intplon10, by(id_conc)
keep id_conc dis_gentri9000 intplat intplon
label var dis_gentri9000 "Min. distance to gentrifying block 9000"
saveold dis_gentri9000_philly, replace v(13)

erase temp.dta 

********************************************
*# pioneers (and their IV) at the ring-level
********************************************

use plants_id_conc_philly.dta, clear
rename id_conc id_conc_disc500

joinby id_conc_disc500 using id_conc_disk_500m_philly.dta

collapse (sum) plants_pion plants_pion_instr plants_npion, by(id_conc t)

foreach v of var plants*{
rename `v' `v'500
}

saveold plants_id_conc500_philly.dta,replace v(13)

*******************
*Distance to subway
*******************

use id_conc_distance_nearest_subway_philly, clear 
keep id_conc distance 
duplicates drop id_conc, force // 63 duplicated id_conc - same distance 
saveold subway, replace v(13) 

******************
*Distance to water
******************

use distance_idconc_water_philly, clear
keep id_conc distance 
duplicates drop id_conc, force 
saveold water, replace v(13)

******************
*Distance to parks
******************

use distance_idconc_parks_philly, clear
keep id_conc distance_parks
destring distance_parks, g(dis_park) dpcomma
duplicates drop id_conc, force 
saveold dist_parks, replace v(13)

************
*# landmarks
************

use landmarks_id_conc_count_philly, clear
rename id_conc id_conc_disc500

joinby id_conc_disc500 using id_conc_disk_500m_philly.dta

collapse (sum) pointlm_count500=pointlm_count, by(id_conc)

saveold lm_count, replace v(13)


**************************************
*We create the database for prediction
**************************************

use id_conc_pop_income_philly , clear 

merge m:1 id_conc using dis_gentri9000_philly
drop if _m==2
drop _m
erase dis_gentri9000_philly.dta 

merge 1:1 id_conc t using plants_id_conc500_philly.dta
drop if _m==2
drop _m
erase plants_id_conc500_philly.dta

merge m:1 id_conc using water
drop if _m==2
drop _m
erase water.dta 

merge m:1 id_conc using subway
drop if _m==2
drop _m
erase subway.dta 

merge m:1 id_conc using dist_parks
drop if _m==2
drop _m
erase dist_parks.dta 

merge m:1 id_conc using lm_count
drop if _m==2
drop _m
erase lm_count.dta 

*Creation of adequate dummies and log variables

g ldis_sub=log(distance_public_transit)
g waterfront=distance_water<0.2
gen ldis_gentri9000=log(dis_gentri9000*1000+1)
gen ldis_parks=log(dis_park*1000+1)
gen lpointlm_count500=log(pointlm_count500+1)

*Creation of various census explanatory variable

foreach v of var black asian white other edu {
gen sh_`v'500=`v'500/pop500
}

gen age_build500=1990-med_yearb`n' if t==1 
replace age_build500=2000-med_yearb`n' if t==2
replace age_build500=2010-med_yearb`n' if t==3

*Creation of proxies for gentrification used as dependent variables

tsset id_conc t

gen dsh_edu=f.sh_edu-sh_edu
gen dlpc_inc=f.lpc_income-lpc_income

*Creation of pioneer explanatory variables and instruments

gen lplants_npion500=log(1+plants_npion500)


foreach v of var plants_pion500 lplants_npion500{
gen d`v'=`v'-l.`v'
}

gen dplants_pion_instr500=plants_pion_instr500-l.plants_pion500

*Creation of lagged proxies for gentrification

tsset id_conc t
gen l_dlpc_inc500=lpc_income500-l.lpc_income500
gen l_dsh_edu500=sh_edu500-l.sh_edu500
label var l_dlpc_inc500 "Lag $\Delta$ Ln per cap. income"
label var l_dsh_edu500 "Lag $\Delta$ Share college edu."


label var ldis_sub "Distance to subway (log)"
label var waterfront "Less than 200m fron waterfront"
label var ldis_park "Distance to closest park (log)"

label var lpc_income500 "Ln per cap. income"
label var lpop500 "Ln pop."
label var sh_edu500 "Share college edu. resid."
label var sh_black500 "Share black resid."
label var sh_asian500 "Share asian resid."
label var sh_white500 "Share white resid."
label var sh_other500 "Share other resid."
label var dplants_pion500 "$\Delta$ \# pioneer plants"
label var dlplants_npion500 "$\Delta$ Ln (1+ \# non pioneer plants)"
label var age_build500 "Median age of buildings"
label var lpointlm_count500 "Ln \# of main landmarks"

label var ldis_gentri9000 "Ln dist. to closest gentrifying block 1990-2000"

merge m:1 id_conc using blocks_couldgentrify_philly.dta
keep if (poor_9000==1 |   poor_0010==1) &_m==3
drop _m

saveold predict_gentri_database_philly, replace v(13)


